library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.4.1 ✔ purrr 0.3.4
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.1
## ✔ readr 2.1.2
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(ggcorrplot)
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ROSE)
## Loaded ROSE 0.0-4
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
data <- read_csv("https://raw.githubusercontent.com/Lanxi-Ada/Lanxi/master/healthcare-dataset-stroke-data.csv")
## Rows: 5110 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): gender, ever_married, work_type, Residence_type, bmi, smoking_status
## dbl (6): id, age, hypertension, heart_disease, avg_glucose_level, stroke
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# data
Shiny
library(shiny)
library(plotly)
library(RColorBrewer)
ui <- fluidPage(
titlePanel("Stroke Data Visualization"),
tabsetPanel(
tabPanel("Scatterplot",
plotlyOutput(outputId = "scatterplot"),
selectInput(inputId = "scatter_x", label = "X-axis:", choices = c("age","avg_glucose_level","bmi"), selected = "age"),
selectInput(inputId = "scatter_y", label = "Y-axis:", choices = c("age","avg_glucose_level","bmi"), selected = "bmi")),
tabPanel("Pie Chart",
selectInput(inputId = "pie_var", label = "Variable:", choices = c("gender", "ever_married", "work_type", "Residence_type", "smoking_status"), selected = "smoking_status"),
plotlyOutput(outputId = "piechart"))
)
)
server <- function(input, output) {
plot_data <- reactive({
data %>%
group_by(!!sym(input$pie_var)) %>%
summarize(count = n(), stroke_count = sum(stroke)) %>%
arrange(desc(count))
})
output$scatterplot <- renderPlotly({
plot_ly(data, x = ~get(input$scatter_x), y = ~get(input$scatter_y), color = ~stroke, type = "scatter", mode = "markers") %>%
layout(title = paste(input$scatter_x, "vs.", input$scatter_y),
xaxis = list(title = input$scatter_x),
yaxis = list(title = input$scatter_y))
})
output$piechart <- renderPlotly({
colors <- brewer.pal(length(plot_data()), "Set3")
plot_ly(plot_data(), labels = plot_data()[[1]], values = plot_data()[[3]] / plot_data()[[2]], type = "pie", hole = 0.6, textfont = list(family = "Arial"), marker = list(colors = colors, line = list(color = "#FFFFFF", width = 1))) %>%
layout(title = paste(input$pie_var, "and Stroke"),
showlegend = TRUE,
legend = list(orientation = "h", x = 0.5, y = -0.1),
paper_bgcolor = "#FFFFFF",
plot_bgcolor = "#FFFFFF")
})
}
# shinyApp(ui, server)
Data processing
data1 <- data %>%
mutate(gender = replace(gender, gender == "Male", 1)) %>%
mutate(gender = replace(gender, gender == "Female", 0)) %>%
mutate(ever_married = replace(ever_married, ever_married == "Yes", 1)) %>%
mutate(ever_married = replace(ever_married, ever_married == "No", 0)) %>%
mutate(work_type = replace(work_type, work_type == "Never_worked", 0)) %>%
mutate(work_type = replace(work_type, work_type == "children", 1)) %>%
mutate(work_type = replace(work_type, work_type == "Govt_job", 2)) %>%
mutate(work_type = replace(work_type, work_type == "Self-employed", 3)) %>%
mutate(work_type = replace(work_type, work_type == "Private", 4)) %>%
mutate(Residence_type = replace(Residence_type, Residence_type == "Urban", 1)) %>%
mutate(Residence_type = replace(Residence_type, Residence_type == "Rural", 0)) %>%
mutate(bmi = replace(bmi, bmi == "N/A", NA)) %>%
mutate(smoking_status = replace(smoking_status, smoking_status == "Unknown", NA)) %>%
mutate(smoking_status = replace(smoking_status, smoking_status == "formerly smoked", 1)) %>%
mutate(smoking_status = replace(smoking_status, smoking_status == "never smoked", 0)) %>%
mutate(smoking_status = replace(smoking_status, smoking_status == "smokes", 2))
Missing value interpolation
library(mice)
data1 <- as.data.frame(sapply(data1, as.numeric))
## Warning in lapply(X = X, FUN = FUN, ...): 强制改变过程中产生了NA
imputed_data <- mice(data1[-1], m = 5, maxit = 50, method = "pmm")
##
## iter imp variable
## 1 1 gender bmi smoking_status
## 1 2 gender bmi smoking_status
## 1 3 gender bmi smoking_status
## 1 4 gender bmi smoking_status
## 1 5 gender bmi smoking_status
## 2 1 gender bmi smoking_status
## 2 2 gender bmi smoking_status
## 2 3 gender bmi smoking_status
## 2 4 gender bmi smoking_status
## 2 5 gender bmi smoking_status
## 3 1 gender bmi smoking_status
## 3 2 gender bmi smoking_status
## 3 3 gender bmi smoking_status
## 3 4 gender bmi smoking_status
## 3 5 gender bmi smoking_status
## 4 1 gender bmi smoking_status
## 4 2 gender bmi smoking_status
## 4 3 gender bmi smoking_status
## 4 4 gender bmi smoking_status
## 4 5 gender bmi smoking_status
## 5 1 gender bmi smoking_status
## 5 2 gender bmi smoking_status
## 5 3 gender bmi smoking_status
## 5 4 gender bmi smoking_status
## 5 5 gender bmi smoking_status
## 6 1 gender bmi smoking_status
## 6 2 gender bmi smoking_status
## 6 3 gender bmi smoking_status
## 6 4 gender bmi smoking_status
## 6 5 gender bmi smoking_status
## 7 1 gender bmi smoking_status
## 7 2 gender bmi smoking_status
## 7 3 gender bmi smoking_status
## 7 4 gender bmi smoking_status
## 7 5 gender bmi smoking_status
## 8 1 gender bmi smoking_status
## 8 2 gender bmi smoking_status
## 8 3 gender bmi smoking_status
## 8 4 gender bmi smoking_status
## 8 5 gender bmi smoking_status
## 9 1 gender bmi smoking_status
## 9 2 gender bmi smoking_status
## 9 3 gender bmi smoking_status
## 9 4 gender bmi smoking_status
## 9 5 gender bmi smoking_status
## 10 1 gender bmi smoking_status
## 10 2 gender bmi smoking_status
## 10 3 gender bmi smoking_status
## 10 4 gender bmi smoking_status
## 10 5 gender bmi smoking_status
## 11 1 gender bmi smoking_status
## 11 2 gender bmi smoking_status
## 11 3 gender bmi smoking_status
## 11 4 gender bmi smoking_status
## 11 5 gender bmi smoking_status
## 12 1 gender bmi smoking_status
## 12 2 gender bmi smoking_status
## 12 3 gender bmi smoking_status
## 12 4 gender bmi smoking_status
## 12 5 gender bmi smoking_status
## 13 1 gender bmi smoking_status
## 13 2 gender bmi smoking_status
## 13 3 gender bmi smoking_status
## 13 4 gender bmi smoking_status
## 13 5 gender bmi smoking_status
## 14 1 gender bmi smoking_status
## 14 2 gender bmi smoking_status
## 14 3 gender bmi smoking_status
## 14 4 gender bmi smoking_status
## 14 5 gender bmi smoking_status
## 15 1 gender bmi smoking_status
## 15 2 gender bmi smoking_status
## 15 3 gender bmi smoking_status
## 15 4 gender bmi smoking_status
## 15 5 gender bmi smoking_status
## 16 1 gender bmi smoking_status
## 16 2 gender bmi smoking_status
## 16 3 gender bmi smoking_status
## 16 4 gender bmi smoking_status
## 16 5 gender bmi smoking_status
## 17 1 gender bmi smoking_status
## 17 2 gender bmi smoking_status
## 17 3 gender bmi smoking_status
## 17 4 gender bmi smoking_status
## 17 5 gender bmi smoking_status
## 18 1 gender bmi smoking_status
## 18 2 gender bmi smoking_status
## 18 3 gender bmi smoking_status
## 18 4 gender bmi smoking_status
## 18 5 gender bmi smoking_status
## 19 1 gender bmi smoking_status
## 19 2 gender bmi smoking_status
## 19 3 gender bmi smoking_status
## 19 4 gender bmi smoking_status
## 19 5 gender bmi smoking_status
## 20 1 gender bmi smoking_status
## 20 2 gender bmi smoking_status
## 20 3 gender bmi smoking_status
## 20 4 gender bmi smoking_status
## 20 5 gender bmi smoking_status
## 21 1 gender bmi smoking_status
## 21 2 gender bmi smoking_status
## 21 3 gender bmi smoking_status
## 21 4 gender bmi smoking_status
## 21 5 gender bmi smoking_status
## 22 1 gender bmi smoking_status
## 22 2 gender bmi smoking_status
## 22 3 gender bmi smoking_status
## 22 4 gender bmi smoking_status
## 22 5 gender bmi smoking_status
## 23 1 gender bmi smoking_status
## 23 2 gender bmi smoking_status
## 23 3 gender bmi smoking_status
## 23 4 gender bmi smoking_status
## 23 5 gender bmi smoking_status
## 24 1 gender bmi smoking_status
## 24 2 gender bmi smoking_status
## 24 3 gender bmi smoking_status
## 24 4 gender bmi smoking_status
## 24 5 gender bmi smoking_status
## 25 1 gender bmi smoking_status
## 25 2 gender bmi smoking_status
## 25 3 gender bmi smoking_status
## 25 4 gender bmi smoking_status
## 25 5 gender bmi smoking_status
## 26 1 gender bmi smoking_status
## 26 2 gender bmi smoking_status
## 26 3 gender bmi smoking_status
## 26 4 gender bmi smoking_status
## 26 5 gender bmi smoking_status
## 27 1 gender bmi smoking_status
## 27 2 gender bmi smoking_status
## 27 3 gender bmi smoking_status
## 27 4 gender bmi smoking_status
## 27 5 gender bmi smoking_status
## 28 1 gender bmi smoking_status
## 28 2 gender bmi smoking_status
## 28 3 gender bmi smoking_status
## 28 4 gender bmi smoking_status
## 28 5 gender bmi smoking_status
## 29 1 gender bmi smoking_status
## 29 2 gender bmi smoking_status
## 29 3 gender bmi smoking_status
## 29 4 gender bmi smoking_status
## 29 5 gender bmi smoking_status
## 30 1 gender bmi smoking_status
## 30 2 gender bmi smoking_status
## 30 3 gender bmi smoking_status
## 30 4 gender bmi smoking_status
## 30 5 gender bmi smoking_status
## 31 1 gender bmi smoking_status
## 31 2 gender bmi smoking_status
## 31 3 gender bmi smoking_status
## 31 4 gender bmi smoking_status
## 31 5 gender bmi smoking_status
## 32 1 gender bmi smoking_status
## 32 2 gender bmi smoking_status
## 32 3 gender bmi smoking_status
## 32 4 gender bmi smoking_status
## 32 5 gender bmi smoking_status
## 33 1 gender bmi smoking_status
## 33 2 gender bmi smoking_status
## 33 3 gender bmi smoking_status
## 33 4 gender bmi smoking_status
## 33 5 gender bmi smoking_status
## 34 1 gender bmi smoking_status
## 34 2 gender bmi smoking_status
## 34 3 gender bmi smoking_status
## 34 4 gender bmi smoking_status
## 34 5 gender bmi smoking_status
## 35 1 gender bmi smoking_status
## 35 2 gender bmi smoking_status
## 35 3 gender bmi smoking_status
## 35 4 gender bmi smoking_status
## 35 5 gender bmi smoking_status
## 36 1 gender bmi smoking_status
## 36 2 gender bmi smoking_status
## 36 3 gender bmi smoking_status
## 36 4 gender bmi smoking_status
## 36 5 gender bmi smoking_status
## 37 1 gender bmi smoking_status
## 37 2 gender bmi smoking_status
## 37 3 gender bmi smoking_status
## 37 4 gender bmi smoking_status
## 37 5 gender bmi smoking_status
## 38 1 gender bmi smoking_status
## 38 2 gender bmi smoking_status
## 38 3 gender bmi smoking_status
## 38 4 gender bmi smoking_status
## 38 5 gender bmi smoking_status
## 39 1 gender bmi smoking_status
## 39 2 gender bmi smoking_status
## 39 3 gender bmi smoking_status
## 39 4 gender bmi smoking_status
## 39 5 gender bmi smoking_status
## 40 1 gender bmi smoking_status
## 40 2 gender bmi smoking_status
## 40 3 gender bmi smoking_status
## 40 4 gender bmi smoking_status
## 40 5 gender bmi smoking_status
## 41 1 gender bmi smoking_status
## 41 2 gender bmi smoking_status
## 41 3 gender bmi smoking_status
## 41 4 gender bmi smoking_status
## 41 5 gender bmi smoking_status
## 42 1 gender bmi smoking_status
## 42 2 gender bmi smoking_status
## 42 3 gender bmi smoking_status
## 42 4 gender bmi smoking_status
## 42 5 gender bmi smoking_status
## 43 1 gender bmi smoking_status
## 43 2 gender bmi smoking_status
## 43 3 gender bmi smoking_status
## 43 4 gender bmi smoking_status
## 43 5 gender bmi smoking_status
## 44 1 gender bmi smoking_status
## 44 2 gender bmi smoking_status
## 44 3 gender bmi smoking_status
## 44 4 gender bmi smoking_status
## 44 5 gender bmi smoking_status
## 45 1 gender bmi smoking_status
## 45 2 gender bmi smoking_status
## 45 3 gender bmi smoking_status
## 45 4 gender bmi smoking_status
## 45 5 gender bmi smoking_status
## 46 1 gender bmi smoking_status
## 46 2 gender bmi smoking_status
## 46 3 gender bmi smoking_status
## 46 4 gender bmi smoking_status
## 46 5 gender bmi smoking_status
## 47 1 gender bmi smoking_status
## 47 2 gender bmi smoking_status
## 47 3 gender bmi smoking_status
## 47 4 gender bmi smoking_status
## 47 5 gender bmi smoking_status
## 48 1 gender bmi smoking_status
## 48 2 gender bmi smoking_status
## 48 3 gender bmi smoking_status
## 48 4 gender bmi smoking_status
## 48 5 gender bmi smoking_status
## 49 1 gender bmi smoking_status
## 49 2 gender bmi smoking_status
## 49 3 gender bmi smoking_status
## 49 4 gender bmi smoking_status
## 49 5 gender bmi smoking_status
## 50 1 gender bmi smoking_status
## 50 2 gender bmi smoking_status
## 50 3 gender bmi smoking_status
## 50 4 gender bmi smoking_status
## 50 5 gender bmi smoking_status
completed_data <- complete(imputed_data)
Balanced data
library(ROSE)
table(completed_data$stroke)
##
## 0 1
## 4861 249
# Upsampling using the ROSE package
final_completed_data <- ovun.sample(stroke ~ ., data = completed_data , method = "over", N = 2 * nrow(completed_data), seed = 123)$data
# View the distribution of processed data
table(final_completed_data$stroke)
##
## 0 1
## 4861 5359
Correlation Matrix
library(ggcorrplot)
data_com <- na.omit(final_completed_data)
corr_matrix <- cor(data_com)
ggcorrplot(corr_matrix, type = "lower",
lab = TRUE, lab_size = 2.5,
colors = c("#6D9EC1", "white", "#E46726"),
title = "Correlation Matrix of Stroke Data")

Dimension reduction
PCA
pca_result <- prcomp(final_completed_data, scale=FALSE)
pca_result$rotation
## PC1 PC2 PC3 PC4
## gender -0.0009271916 0.0005642831 -0.002566153 -0.0525983190
## age -0.1210621368 -0.9913367507 -0.045942263 0.0140098645
## hypertension -0.0011748706 -0.0044066799 0.001930888 0.0411412038
## heart_disease -0.0015532465 -0.0027900786 -0.001423262 -0.0105980154
## ever_married -0.0016988450 -0.0102081371 0.007986579 -0.0795132770
## work_type -0.0021032500 -0.0150272258 0.028055589 -0.9276077375
## Residence_type -0.0002940283 -0.0005575868 0.001906882 0.0084567448
## avg_glucose_level -0.9920837408 0.1224532394 -0.027762665 -0.0004275118
## bmi -0.0330965831 -0.0417152999 0.998097945 0.0296507844
## smoking_status -0.0006309870 -0.0027356905 0.007022750 -0.3558678027
## stroke -0.0024452980 -0.0122940750 -0.001685058 0.0293425765
## PC5 PC6 PC7 PC8
## gender 0.1007643237 -2.341295e-01 0.9612314409 0.0007183936
## age 0.0017704409 -6.504037e-04 0.0005503099 -0.0116356263
## hypertension -0.0111160526 -2.943461e-02 -0.0120686306 0.4100541900
## heart_disease 0.0669098119 1.679558e-02 0.0897131340 0.1102859363
## ever_married 0.0167556700 -9.598566e-02 -0.0225282506 -0.2342113464
## work_type -0.3598320762 3.798557e-02 -0.0034393024 0.0696795074
## Residence_type 0.0551627496 9.645333e-01 0.2266024491 -0.0620129283
## avg_glucose_level -0.0005252629 7.270765e-05 -0.0011484805 -0.0011139388
## bmi 0.0039165131 -2.464268e-03 0.0033516089 0.0004099522
## smoking_status 0.9218908083 -1.993013e-02 -0.1263188524 -0.0154968361
## stroke 0.0504167148 5.148263e-02 0.0016402236 0.8693441879
## PC9 PC10 PC11
## gender -1.740453e-04 -0.0339455290 0.084479793
## age -4.094229e-03 0.0095755463 0.006382467
## hypertension -8.397053e-01 -0.3483500898 -0.050497375
## heart_disease -5.882168e-02 0.3946537433 -0.903157486
## ever_married 2.556533e-01 -0.8332078317 -0.411164643
## work_type -2.646306e-02 0.0430270962 0.013631380
## Residence_type -2.428277e-02 -0.1035693063 -0.006815090
## avg_glucose_level 3.685793e-05 0.0000802532 0.001121660
## bmi 1.211828e-03 0.0069348805 0.001843706
## smoking_status -4.516642e-02 0.0155514676 0.067398632
## stroke 4.719408e-01 -0.1206377949 0.027260649
# select the number of principal component
screeplot(pca_result, type="lines")

pca_df <- data.frame(predict(pca_result, final_completed_data))[c(1,2,3)]
df <- cbind(final_completed_data, pca_df)
library(plotly)
df$stroke <- as.factor(df$stroke)
plot_ly(df, x=~PC1, y=~PC2, z=~PC3, color=~stroke,marker=list(size=3,opacity=0.1))
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Assume that pca_result is the result of PCA analysis that has been performed
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_contrib(pca_result, choice = "var", axes = 1:9, top = 10)

Umap
library(umap)
data_matrix <- as.matrix(final_completed_data)
umap_result <- umap(data_matrix)
library(ggplot2)
umap_df <- data.frame(X1 = umap_result$layout[,1], X2 = umap_result$layout[,2])
umap_df1 <- cbind(umap_df,final_completed_data)
ggplot(umap_df1, aes(X1,X2)) +
geom_point(aes(color = stroke), alpha = 0.7, size = 1.5)

Divide the training set test set
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(123)
trainIndex <- createDataPartition(umap_df1$stroke, p = 0.8, list = FALSE)
# Divide the data set into 80% training set and 20% test set
train <- umap_df1[trainIndex, ]
test <- umap_df1[-trainIndex, ]
Training SVM
library(e1071)
# SVM Model
svm_model <- svm(stroke ~ X1+X2, data = train, kernel = "radial", cost = 10, probability = TRUE)
svm_pred <- predict(svm_model, test, probability = TRUE)
svm_pred1 <- ifelse(svm_pred > 0.5, 1, 0)
confusionMatrix(as.factor(svm_pred1), as.factor(test$stroke))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 711 151
## 1 233 949
##
## Accuracy : 0.8121
## 95% CI : (0.7945, 0.8289)
## No Information Rate : 0.5382
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6197
##
## Mcnemar's Test P-Value : 3.573e-05
##
## Sensitivity : 0.7532
## Specificity : 0.8627
## Pos Pred Value : 0.8248
## Neg Pred Value : 0.8029
## Prevalence : 0.4618
## Detection Rate : 0.3478
## Detection Prevalence : 0.4217
## Balanced Accuracy : 0.8080
##
## 'Positive' Class : 0
##
Predict
result <- data.frame(actual = test$stroke, predict = svm_pred1)
result <- cbind(result,test)
library(plotly)
plot_ly(result, x=~X1, y=~X2, color=~factor(predict), text = ~paste("Actual:", factor(actual)), showlegend = TRUE,marker=list(size=7,opacity=0.8)) %>%
layout(title = "Predict result in test data")
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels